Data Preparation

Top Companies Prep

Bring top companies into dataframe.

#names(cmspayments)
#Companies is "Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name"
toppayor <- aggregate(Total_Amount_of_Payment_USDollars ~ Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name
                                                        
                      ,cmspayments,FUN = sum)
colnames(toppayor) <- c("Company","TotalPayment")
toppayor <- head(toppayor[order(toppayor$TotalPayment, decreasing= TRUE),], n = 10)
names(cmspayments)
 [1] "X"                                                                 "Change_Type"                                                       "Covered_Recipient_Type"                                           
 [4] "Teaching_Hospital_CCN"                                             "Teaching_Hospital_ID"                                              "Teaching_Hospital_Name"                                           
 [7] "Physician_Profile_ID"                                              "Physician_First_Name"                                              "Physician_Middle_Name"                                            
[10] "Physician_Last_Name"                                               "Physician_Name_Suffix"                                             "Recipient_Primary_Business_Street_Address_Line1"                  
[13] "Recipient_Primary_Business_Street_Address_Line2"                   "Recipient_City"                                                    "Recipient_State"                                                  
[16] "Recipient_Zip_Code"                                                "Recipient_Country"                                                 "Recipient_Province"                                               
[19] "Recipient_Postal_Code"                                             "Physician_Primary_Type"                                            "Physician_Specialty"                                              
[22] "Physician_License_State_code1"                                     "Physician_License_State_code2"                                     "Physician_License_State_code3"                                    
[25] "Physician_License_State_code4"                                     "Physician_License_State_code5"                                     "Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name"        
[28] "Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID"       "Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name"     "Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State"   
[31] "Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country"  "Total_Amount_of_Payment_USDollars"                                 "Date_of_Payment"                                                  
[34] "Number_of_Payments_Included_in_Total_Amount"                       "Form_of_Payment_or_Transfer_of_Value"                              "Nature_of_Payment_or_Transfer_of_Value"                           
[37] "City_of_Travel"                                                    "State_of_Travel"                                                   "Country_of_Travel"                                                
[40] "Physician_Ownership_Indicator"                                     "Third_Party_Payment_Recipient_Indicator"                           "Name_of_Third_Party_Entity_Receiving_Payment_or_Transfer_of_Value"
[43] "Charity_Indicator"                                                 "Third_Party_Equals_Covered_Recipient_Indicator"                    "Contextual_Information"                                           
[46] "Delay_in_Publication_Indicator"                                    "Record_ID"                                                         "Dispute_Status_for_Publication"                                   
[49] "Product_Indicator"                                                 "Name_of_Associated_Covered_Drug_or_Biological1"                    "Name_of_Associated_Covered_Drug_or_Biological2"                   
[52] "Name_of_Associated_Covered_Drug_or_Biological3"                    "Name_of_Associated_Covered_Drug_or_Biological4"                    "Name_of_Associated_Covered_Drug_or_Biological5"                   
[55] "NDC_of_Associated_Covered_Drug_or_Biological1"                     "NDC_of_Associated_Covered_Drug_or_Biological2"                     "NDC_of_Associated_Covered_Drug_or_Biological3"                    
[58] "NDC_of_Associated_Covered_Drug_or_Biological4"                     "NDC_of_Associated_Covered_Drug_or_Biological5"                     "Name_of_Associated_Covered_Device_or_Medical_Supply1"             
[61] "Name_of_Associated_Covered_Device_or_Medical_Supply2"              "Name_of_Associated_Covered_Device_or_Medical_Supply3"              "Name_of_Associated_Covered_Device_or_Medical_Supply4"             
[64] "Name_of_Associated_Covered_Device_or_Medical_Supply5"              "Program_Year"                                                      "Payment_Publication_Date"                                         
[67] "PhysSpcOne"                                                        "PhysSpcTwo"                                                        "PhysSpcThree"                                                     

Simplify the data manipulation

Using RSQLite, use SQL to simplify data manipulation.

Using the top companies dataframe, write to table.

rsqlcon <- dbConnect(RSQLite::SQLite(), ":memory:")

dbWriteTable(conn = rsqlcon
             ,"cmspaytable"
             ,cmspayments
             ,overwrite = TRUE)

topCompany <- RSQLite::dbGetQuery(conn = rsqlcon
                                ,"select Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name as Company
                                        ,sum(Total_Amount_of_Payment_USDollars) as TotalPayments
                                  from cmspaytable
                                  group by 1
                                  order by 2 desc
                                  limit 10"
                                  )
##has top company by revenue
dbWriteTable(conn = rsqlcon
             ,"topcompanytable"
             ,topCompany
             ,overwrite = TRUE)
print(topCompany)

Create a dataset with company, payment specialty, and total payments. Group by the attributes and sum the measures. Then get the top companies with specialty only if they exist in the top 10 companies data frame.

topspecialty <- RSQLite::dbGetQuery(conn = rsqlcon
                    , "select c.Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name as Company
                             ,c.Physician_Specialty
                             ,sum(c.Total_Amount_of_Payment_USDollars) as TotalPayments
                       from cmspaytable as c inner join (select distinct Company
                                          from topcompanytable
                                          order by topcompanytable.TotalPayments desc
                                          limit 5
                                          ) as tt on c.Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name
                                                   = tt.Company
                       group by 1,2
                       order by 3 desc,2,1
                    ")
topspecialty$Physician_Specialty <- within(topspecialty, X <- ifelse(nchar(topspecialty$Physician_Specialty) == "0"
                                                          , "NA"
                                                          ,topspecialty$Physician_Specialty)
                                          )
                      
print(topspecialty)

Visualize

  1. Top 10 companies barchart
  2. Top 5 companies

#plot_ly(x = c(topCompany$Company)
#       ,y = c(topCompany$TotalPayments)
#       ,type = "bar")
#par(mfrow - c(2,2))
g <- ggplot(topCompany,aes(Company,TotalPayments/1000000))
g+geom_bar(stat = "identity"
           ,colour = "dark blue"
           ,fill = "dodger blue") +theme(axis.text.x = element_text(angle=45
                                                                ,hjust = 1)
                                     ,axis.text.y = element_text(angle = 45)
                                     )+ labs(y = "Total Payments")+geom_line()+
                geom_hline(yintercept = mean(topCompany$TotalPayments/1000000), color="blue")
summa

### use ping to produce image and control size.
specialtytree <- file.path(getwd(),paste("specialtytreemap",".png",sep = ""))
png(filename = specialtytree,
    width = 1000, height = 850, units = "px", pointsize = 12,
    bg = "white",  res = NA,## ...,
    ##type = c("cairo", "cairo-png", "Xlib", "quartz"), 
    antialias = c("default")
)



treemap::treemap(dtf = topspecialty, index = c("Company","Physician_Specialty")
                 ,type = "index"
                 , fontsize.labels=c(12,10,10)
                 ,vSize = "TotalPayments"
                 ,vColor = "Physician_Specialty"
                 ,command.line.output = FALSE
                 #,fontface.labels = "bold"
                 ,palette = "Set1"
                 ,fun.aggregate = "sum"
                 , align.labels=list(
                                      c("center", "center"), 
                                      c("center", "bottom")
                                      )
                  )
dev.off()
Specialty Tree Map

Specialty Tree Map

---
title: "Compliance Analytics Open Payments Case "
author: "Michael Garcia"
date: "May 28, 2019"
output:
  html_notebook: default
  pdf_document: default
  word_document: default
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r loadpkg, include=TRUE,echo=FALSE, message=FALSE}
library(RSQLite)
library(DBI)
library(tidyr)
library(ggplot2)
#library(plotly)
```

## Data Preparation

```{r fetchdata, include=TRUE,echo=FALSE, message=FALSE}
cwd <- paste0(getwd())
setwd(cwd)

download.file(url = "https://s3.amazonaws.com/istreet-assets/i0jhsz_Udi_Zl2jxJ-c-pQ/CMS_OPEN_PAYMENTS_DEC_2013.zip"
              ,destfile = "CMS_OPEN_PAYMENTS_DEC_2013.zip")


cmszip <- unzip("CMS_OPEN_PAYMENTS_DEC_2013.zip")
cmspayments <- read.csv(cmszip)
### physician specialty has nas
cmspayments$Physician_Specialty <-  as.character(cmspayments$Physician_Specialty)
cmspayments$Physician_Specialty <- ifelse(nchar(cmspayments$Physician_Specialty)== 0
                                                , "NA"
                                                ,cmspayments$Physician_Specialty)
```


### Top Companies Prep
Bring top companies into dataframe.

```{r toppayer, echo=TRUE,include=TRUE}
#names(cmspayments)
#Companies is "Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name"
toppayor <- aggregate(Total_Amount_of_Payment_USDollars ~ Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name
                                                        
                      ,cmspayments,FUN = sum)
colnames(toppayor) <- c("Company","TotalPayment")
toppayor <- head(toppayor[order(toppayor$TotalPayment, decreasing= TRUE),], n = 10)
```


```{r physspecial_split,echo=TRUE, include=TRUE}

#within(cmspayments, FOO<-data.frame(do.call('rbind', strsplit(as.character(cmspayments$Physician_Specialty), '|', fixed=TRUE))))
physplit <- data.frame(do.call('rbind', strsplit(as.character(cmspayments$Physician_Specialty),'|',fixed=TRUE)))
colnames(physplit) <- c("PhysSpcOne","PhysSpcTwo","PhysSpcThree")

```


### Simplify the data manipulation
Using RSQLite, use SQL to simplify data manipulation.

Using the top companies dataframe, write to table. 
```{r sqltransform, echo=TRUE, include=TRUE}
rsqlcon <- dbConnect(RSQLite::SQLite(), ":memory:")

dbWriteTable(conn = rsqlcon
             ,"cmspaytable"
             ,cmspayments
             ,overwrite = TRUE)

topCompany <- RSQLite::dbGetQuery(conn = rsqlcon
                                ,"select Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name as Company
                                        ,sum(Total_Amount_of_Payment_USDollars) as TotalPayments
                                  from cmspaytable
                                  group by 1
                                  order by 2 desc
                                  limit 10"
                                  )
##has top company by revenue
dbWriteTable(conn = rsqlcon
             ,"topcompanytable"
             ,topCompany
             ,overwrite = TRUE)
print(topCompany)
```



Create a dataset with company, payment specialty, and total payments. Group by the attributes and sum the measures.
Then get the top companies with specialty only if they exist in the top 10 companies data frame.


```{r topspecialty, echo= TRUE, include= TRUE}
topspecialty <- RSQLite::dbGetQuery(conn = rsqlcon
                    , "select c.Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name as Company
                             ,c.Physician_Specialty
                             ,sum(c.Total_Amount_of_Payment_USDollars) as TotalPayments
                       from cmspaytable as c inner join (select distinct Company
                                          from topcompanytable
                                          order by topcompanytable.TotalPayments desc
                                          limit 5
                                          ) as tt on c.Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name
                                                   = tt.Company
                       group by 1,2
                       order by 3 desc,2,1
                    ")
topspecialty$Physician_Specialty <- within(topspecialty, X <- ifelse(nchar(topspecialty$Physician_Specialty) == "0"
                                                          , "NA"
                                                          ,topspecialty$Physician_Specialty)
                                          )
                      
print(topspecialty)
```

## Visualize
1. Top 10 companies barchart
2. Top 5 companies


```{r visualize,include=TRUE, echo=TRUE}

#plot_ly(x = c(topCompany$Company)
#       ,y = c(topCompany$TotalPayments)
#       ,type = "bar")
#par(mfrow - c(2,2))
g <- ggplot(topCompany,aes(Company,TotalPayments/1000000))
g+geom_bar(stat = "identity"
           ,colour = "dark blue"
           ,fill = "dodger blue") +theme(axis.text.x = element_text(angle=45
                                                                ,hjust = 1)
                                     ,axis.text.y = element_text(angle = 45)
                                     )+ labs(y = "Total Payments")+geom_line()+
                geom_hline(yintercept = mean(topCompany$TotalPayments/1000000), color="blue")
```

```{r topspecialtysummary, echo=TRUE,include=TRUE}
summa
```


```{r visualizespecialty_copy, echo=FALSE,include=FALSE}
#plot_ly(x = c(topCompany$Company)
#       ,y = c(topCompany$TotalPayments)
#       ,type = "bar")
#par(mfrow - c(2,2))
ts <- aggregate(TotalPayments~Physician_Specialty,topspecialty,sum)


b <- ggplot(topspecialty,aes(Company,TotalPayments/1000000
                           ,fill = rep(unlist(Physician_Specialty)))
            )

b+geom_bar(stat = "identity"
           ,colour = "dark blue"
           ,fill = "dodger blue") +theme(axis.text.x = element_text(angle=45
                                                                ,hjust = 1)
                                     ,axis.text.y = element_text(angle = 45)
                                     )+ labs(y = "Total Payments")
```

```{r treemap, echo=TRUE,include=TRUE}

### use ping to produce image and control size.
specialtytree <- file.path(getwd(),paste("specialtytreemap",".png",sep = ""))
png(filename = specialtytree,
    width = 1000, height = 850, units = "px", pointsize = 12,
    bg = "white",  res = NA,## ...,
    ##type = c("cairo", "cairo-png", "Xlib", "quartz"), 
    antialias = c("default")
)



treemap::treemap(dtf = topspecialty, index = c("Company","Physician_Specialty")
                 ,type = "index"
                 , fontsize.labels=c(12,10,10)
                 ,vSize = "TotalPayments"
                 ,vColor = "Physician_Specialty"
                 ,command.line.output = FALSE
                 #,fontface.labels = "bold"
                 ,palette = "Set1"
                 ,fun.aggregate = "sum"
                 , align.labels=list(
                                      c("center", "center"), 
                                      c("center", "bottom")
                                      )
                  )
dev.off()
```

![Specialty Tree Map](specialtytreemap.png)








